library(RColorBrewer)
library(ISLR)
library(latex2exp)
library(caret) # loads the knnreg function
library(boot) # loads the cv.glm function
cols <- brewer.pal(9, "Set1")
We try to answer the question: how do we assess model accuracy? Cross-validation can be used to estimate the test error associated with a given statistical learning method, either to evaluate its performance or to select the appropriate level of flexibility.
First, we need to choose a measure of error for a model. In the regression setting, the most commonly-used measure is the mean squared error (MSE), given by
\[\text{MSE} = \dfrac{1}{n} \sum_{i=1}^{n} \{y_{i} - \hat{f}(\textbf{x}_{i})\}^{2}\] where \(\hat{f}(\textbf{x}_{i})\) is the prediction that \(\hat{f}\) gives for the \(i^{th}\) observation.
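In R this is just the average of the squared differences between observations and predictions; a minimal sketch (the names y and y_hat are placeholders for a vector of observations and a vector of predictions):
# MSE between observed values y and predictions y_hat (placeholder names)
mse <- function(y, y_hat) mean((y - y_hat)^2)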
Let us see how the bias-variance tradeoff works with a practical example: k-nearest neighbour (k-NN) regression.
# True function generating the data
fx <- function(x){
return (0.05 * x^2 - 0.5*x)
}
xmin <- 0
xmax <- 10
sigma <- 0.2
samp_size <- 100
xgrid <- seq(xmin, xmax, length.out = 1000)
X <- runif(samp_size, xmin, xmax)
Y <- rnorm(samp_size, fx(X), sigma)
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, fx(xgrid), col = cols[1], lwd = 2)
Let us see how the 1-nearest neighbour fit behaves.
fit <- knnreg(data.frame(X = X), Y, k = 1)
y_hat <- predict(fit, data.frame(X = xgrid))
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, y_hat, col = cols[2], lwd = 2)
This does not look right! The fit is essentially chasing the noise that is inherent in the data: low bias, high variance (with a new sample, the predictions would be wildly different).
How about if we increase \(k\) to \(10\)?
fit <- knnreg(data.frame(X = X), Y, k = 10)
y_hat <- predict(fit, data.frame(X = xgrid))
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, y_hat, col = cols[2], lwd = 2)
This seems about right!
And if we keep increasing \(k\) to \(40\)?
fit <- knnreg(data.frame(X = X), Y, k = 40)
y_hat <- predict(fit, data.frame(X = xgrid))
plot(X, Y, pch = 16, xlab = 'X', ylab = 'Y')
lines(xgrid, y_hat, col = cols[2], lwd = 2)
We are now missing some of the structure in the data (most evident at the boundaries): high bias, low variance.
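To make the tradeoff concrete, here is a quick sketch: we generate a fresh test sample from the same model and compare the test MSE of the three fits (the seed is an arbitrary choice).
# Sketch: test MSE for k = 1, 10, 40 on a fresh sample from the same model
set.seed(42)
X_test <- runif(samp_size, xmin, xmax)
Y_test <- rnorm(samp_size, fx(X_test), sigma)
for (k in c(1, 10, 40)){
  fit_k <- knnreg(data.frame(X = X), Y, k = k)
  pred_k <- predict(fit_k, data.frame(X = X_test))
  cat("k =", k, " test MSE =", mean((Y_test - pred_k)^2), "\n")
}
The test MSE should be largest for k = 1 (variance dominates) and for k = 40 (bias dominates), and smallest for an intermediate k.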
If we have a lot of data and can afford to leave some of it out, a simple approach is to use a randomly selected subset of the observations to train the model, and then to evaluate the error on the left-out test data. This is useful when we want to compare a set of candidate models (e.g. different polynomial degrees). Let's split the data into two sets:
It is difficult to give a general rule on how many observations to put in each part, as this depends on the signal-to-noise ratio in the data and on the training sample size. A typical split might be \(80\%\) for training and \(20\%\) for validation.
Let us load and plot the Auto data from the ISLR package.
rm(list=ls())
cols <- brewer.pal(9, "Set1")
plot(Auto$horsepower, Auto$mpg, pch = 16, xlab = 'horsepower', ylab = 'mpg')
Let us now create the splits of the data.
n <- nrow(Auto)
set.seed(1234)
train_idx <- sample(1:n, floor(0.8 * n))
test_idx <- (1:n)[-train_idx]
And fit the different proposed models to the training data:
d_max <- 12
val_err <- array(NA, dim = d_max)
for (d in 1:d_max){
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto[train_idx,])
y_hat <- predict(glm_fit, newdata = Auto[test_idx,])
val_err[d] <- mean((y_hat - Auto$mpg[test_idx])^2)
}
which.min(val_err)
[1] 2
We choose the model for which the validation error is minimal, i.e. \(d = 2\).
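As a quick check, one way (a sketch, not required by the lab) to visualize the selected model is to refit the quadratic on the training split and overlay its predictions on the data:
# Overlay the selected degree-2 fit (refitted on the training data)
glm_best <- glm(mpg ~ poly(horsepower, 2), data = Auto[train_idx,])
hp_grid <- data.frame(horsepower = seq(min(Auto$horsepower), max(Auto$horsepower), length.out = 200))
plot(Auto$horsepower, Auto$mpg, pch = 16, xlab = 'horsepower', ylab = 'mpg')
lines(hp_grid$horsepower, predict(glm_best, newdata = hp_grid), col = cols[1], lwd = 2)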
Leave-one-out cross-validation (LOOCV) is available out of the box for models fitted with glm, through the cv.glm function from the boot package.
glm_fit <- glm(mpg ~ horsepower, data = Auto) # a simple linear regression, without specifying the family
?cv.glm # this is the built-in CV function for all glm models
cv.glm(Auto, glm_fit)$delta[1] # pretty slow (does not use formula (5.2) on page 180)
[1] 24.23151
Let’s write a simple function in order to use the magic formula that we saw in class for LOOCV with least squares, i.e.
\[CV_{(n)} = \dfrac{1}{n} \sum_{i=1}^{n} \left(\dfrac{y_{i} - \hat{y}_{i}}{1 - h_{i}}\right)^{2}\] where \(h_{i}\) is the leverage of the \(i^{th}\) observation.
loocv_lm <- function(fit){
h <- lm.influence(fit)$h # h is the leverage statistic, it is a vector with length n
# h is also the diagonal of the hat matrix H = X(X'X)^(-1)X':
# diag(X %*% solve(crossprod(X)) %*% t(X))
return (mean((residuals(fit) / (1-h))^2))
}
loocv_lm(glm_fit)
[1] 24.23151
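As a sanity check, we can also run LOOCV by brute force: leave out one observation at a time, refit, and predict it. This is only a sketch (much slower than the shortcut), but it should reproduce the same value.
# Brute-force LOOCV for the simple linear regression mpg ~ horsepower
loocv_brute <- array(NA, dim = n)
for (i in 1:n){
  fit_i <- glm(mpg ~ horsepower, data = Auto[-i,])
  loocv_brute[i] <- (Auto$mpg[i] - predict(fit_i, newdata = Auto[i,]))^2
}
mean(loocv_brute) # should match loocv_lm(glm_fit)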
loocv_error <- array(NA, dim = d_max)
MSE <- array(NA, dim = d_max)
for(d in 1:d_max){
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
loocv_error[d] <- loocv_lm(glm_fit)
MSE[d] <- mean(residuals(glm_fit)^2)
}
plot(1:d_max, loocv_error, type = "b", ylim = c(min(MSE), max(loocv_error)),
pch = 16, xlab = 'degree', ylab = 'MSE', col = cols[1])
lines(1:d_max, MSE, pch = 16, type = "b", col = cols[2])
legend('topright', legend = c('LOOCV', 'Training'), col = cols[1:2], pch = 16)
\(10\)-fold cross-validation
cv10_error <- array(NA, dim = d_max)
for(d in 1:d_max){
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto)
cv10_error[d] <- cv.glm(Auto, glm_fit, K = 10)$delta[1]
}
plot(1:d_max, loocv_error, type = "b", pch = 16, xlab = 'degree', ylab = 'MSE', col = cols[1])
lines(1:d_max, cv10_error, pch = 16, type = "b", col = cols[3])
legend('topright', legend = c('LOOCV', '10 fold CV'), col = cols[c(1,3)], pch = 16)
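Unlike LOOCV, the \(K\)-fold estimate depends on the random assignment of observations to folds. A quick sketch (with the degree fixed at 2, and a new model object glm_d2 introduced here only for this check) shows this variability by repeating the 10-fold CV a few times:
# Repeat 10-fold CV a few times for the degree-2 model:
# each run uses a different random fold assignment
glm_d2 <- glm(mpg ~ poly(horsepower, 2), data = Auto)
replicate(5, cv.glm(Auto, glm_d2, K = 10)$delta[1])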
We might need to use cross-validation with more complex models, for which the cv.glm() function is not available. Let us implement it from scratch!
K <- 10 # number of folds
folds <- cut(sample(1:n), breaks = K, labels = FALSE)
folds
[1] 3 2 9 2 1 4 9 2 3 5 6 2 2 8 10 3 1 9 8 3 9 1 3 2 6
[26] 2 7 1 5 5 9 5 10 6 4 6 9 2 4 8 6 1 3 6 8 2 2 9 4 8
[51] 3 6 1 4 5 9 6 1 2 9 6 2 3 5 2 10 8 1 8 1 9 7 1 10 7
[76] 2 10 1 7 6 7 7 9 6 9 6 7 2 8 8 9 2 6 9 5 1 8 5 5 8
[101] 2 7 5 8 7 2 5 2 7 1 4 6 7 6 1 3 9 7 7 10 8 6 3 2 5
[126] 10 6 4 8 10 2 6 5 7 6 10 5 1 1 9 4 9 10 5 10 4 10 6 2 4
[151] 4 2 9 1 5 6 5 1 2 8 6 8 4 4 3 1 4 9 10 8 6 9 6 1 4
[176] 7 4 4 5 4 6 8 3 8 7 5 6 5 4 2 3 5 3 3 4 7 8 8 3 8
[201] 4 7 2 2 1 10 9 2 8 2 1 3 4 9 2 5 3 9 6 3 3 8 4 5 1
[226] 5 7 3 10 8 10 2 4 8 10 10 7 9 9 6 1 9 1 7 4 8 9 1 10 10
[251] 5 3 7 5 10 6 2 4 5 6 7 3 10 1 8 4 9 1 1 10 4 9 10 1 8
[276] 9 9 7 2 7 8 5 5 3 3 5 3 1 7 4 5 3 7 3 10 6 6 7 7 3
[301] 10 9 8 10 4 4 4 5 7 9 1 10 4 9 10 1 2 8 5 2 3 8 5 6 9
[326] 1 8 10 10 2 3 7 5 6 1 7 10 7 6 2 5 8 3 10 8 6 1 10 7 10
[351] 3 2 8 9 7 5 1 9 4 6 5 7 1 4 9 3 1 7 5 1 6 6 10 7 9
[376] 2 3 4 3 10 4 3 10 8 4 8 3 3 7 10 4 10
Look at the folds vector: each element corresponds to an observation and gives the fold to which that observation is assigned. We will iterate over the folds, using the observations in the current fold as the test set and all remaining observations to train the model.
Why did we use sample(1:n) in the call to cut()? This is very important: without shuffling, cut() assigns contiguous blocks of rows to the same fold, so if the rows of the data frame were stored in some systematic order the folds would not be a random partition of the observations.
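We can see the difference directly (a quick check using the objects already defined above):
# Without shuffling, cut() assigns contiguous blocks of rows to the same fold
head(cut(1:n, breaks = K, labels = FALSE), 50)
# With sample(1:n) the fold sizes are still roughly n/K, but the assignment is random
table(folds)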
cv_error <- array(NA, dim = c(K, d_max))
for (d in 1:d_max){
for (i in 1:K){
test <- which(folds == i)
train <- which(folds != i)
glm_fit <- glm(mpg ~ poly(horsepower, d), data = Auto[train,])
cv_error[i,d] <- mean((Auto$mpg[test] - predict(glm_fit, newdata = Auto[test,]))^2)
}
}
plot(1:d_max, colMeans(cv_error), type = "b", pch = 16, xlab = 'degree',
ylab = 'CV MSE')
Many people use a more stringent criterion: instead of taking the model with the minimum average CV error, they take the most parsimonious model whose average CV error is within one standard error of the minimum average CV error.
We can compute standard errors for the CV error curve at each tuning parameter value \(d\). First define, for \(k = 1, \dots, K\): \[ CV_{k}(d) = \dfrac{1}{n_{k}} \sum_{i \in F_{k}} \{y_{i} - \hat{f}_{d}^{(-k)}(x_{i})\}^{2} \] where \(n_{k}\) is the number of points in the \(k^{th}\) fold. Then we compute the sample standard deviation of \(CV_{1}(d), \dots, CV_{K}(d)\), \[ \text{SD}(d) = \sqrt{\text{var}\{\text{CV}_{1}(d), \dots, \text{CV}_{K}(d)\}} \] Finally, we use \(\text{SE}(d) = \text{SD}(d)/\sqrt{K}\).
The one standard error rule is an alternative way of choosing \(d\) from the CV curve. We start with the usual estimate \[ \hat{d} = \text{argmin}_{d \in \{1, \dots, D_{max}\}} \text{CV}(d) \]
and we move \(d\) in the direction of increasing regularization until it ceases to be true that \[\text{CV}(d) \leq \text{CV}(\hat{d}) + \text{SE}(\hat{d})\] In words, we take the simplest (most regularized) model whose error is within one standard error of the minimal error.
cv_RMSE <- colMeans(cv_error)
sd_cv_RMSE <- apply(cv_error, 2, sd)/sqrt(K)
library(plotrix)
plotCI(x = 1:d_max, y = cv_RMSE, uiw = sd_cv_RMSE, col = 'red',
scol = 'gray', pch = 16, lwd = 2, xlab = 'degree',
ylab = 'CV MSE')
# Trace the line corresponding to the optimal model size
abline(v = which.min(cv_RMSE), lwd = 1, col = 1, lty = 2)
# Look for the indices before the optimal one whose mean CV error is still
# lower than the optimal MSE + one standard error
abline(h = min(cv_RMSE) + sd_cv_RMSE[which.min(cv_RMSE)], lty = 2)
idx <- which( min(cv_RMSE) + sd_cv_RMSE[which.min(cv_RMSE)] >= cv_RMSE)
# Pick the first of these indices (i.e. the simplest such model)
idx <- idx[1]
# Trace the line corresponding to the simplest model among the ones
# with error within one standard error of the optimal error
abline(v = idx, lwd = 1, col = 1, lty = 3)
Generate \(n = 300\) observations from the model
\[\begin{aligned} &x_1, \dots, x_n \sim \text{Unif}(0, 10) \\ &y_i \mid x_i \sim \mathcal{N}(\sin(x_i), 0.5) \end{aligned}\]
Determine the optimal \(k\) for k-NN regression applied to that data. Plot the result.
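One possible sketch of a solution, reusing the from-scratch \(K\)-fold machinery above (the grid of candidate values for \(k\), the number of folds and the seed are arbitrary choices):
# Sketch of a solution: 10-fold CV over a grid of k for k-NN regression
set.seed(1)
n_sim <- 300
x_sim <- runif(n_sim, 0, 10)
y_sim <- rnorm(n_sim, sin(x_sim), 0.5)
k_grid <- 1:50
K_sim <- 10
folds_sim <- cut(sample(1:n_sim), breaks = K_sim, labels = FALSE)
cv_knn <- array(NA, dim = c(K_sim, length(k_grid)))
for (j in seq_along(k_grid)){
  for (i in 1:K_sim){
    test <- which(folds_sim == i)
    train <- which(folds_sim != i)
    fit <- knnreg(data.frame(x = x_sim[train]), y_sim[train], k = k_grid[j])
    pred <- predict(fit, data.frame(x = x_sim[test]))
    cv_knn[i, j] <- mean((y_sim[test] - pred)^2)
  }
}
k_best <- k_grid[which.min(colMeans(cv_knn))]
plot(k_grid, colMeans(cv_knn), type = "b", pch = 16, xlab = 'k', ylab = 'CV MSE')
abline(v = k_best, lty = 2)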
rm(list=ls())
cols <- brewer.pal(9, "Set1")
n <- 200
set.seed(1234)
X <- rnorm(n, 2, 3)
We use the bootstrap to assess the variability of the sample median of \(X\): for \(b = 1, \dots, B\) we draw \(n\) observations with replacement from the data and compute the median of the resampled dataset, \(\tilde{X}^{(b)}\). We then have a sample of estimates \((\tilde{X}^{(1)}, \dots, \tilde{X}^{(B)})\).
B <- 5000
mu_boot <- array(NA, dim = B)
for (b in 1:B){
idx_boot <- sample(1:n, n, replace = TRUE)
X_boot <- X[idx_boot]
mu_boot[b] <- median(X_boot)
}
xgrid <- seq(min(mu_boot), max(mu_boot), length.out = 100)
par(mar=c(4,4,2,2), family = 'serif')
hist(mu_boot, col = 'gray', border = 'white', probability = T, main = '', xlab = TeX("$\\mu$"))
abline(v = median(X), lwd = 2, lty = 2)
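From the bootstrap replicates we can also report a standard error and a simple \(95\%\) percentile interval for the median (a short sketch using base R only):
# Bootstrap standard error and 95% percentile interval for the sample median
sd(mu_boot)
quantile(mu_boot, probs = c(0.025, 0.975))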
Use the bootstrap to evaluate the variability of the regression coefficients for the model
\[ \begin{aligned} &y_{i} = -1 + 2 x_{i} + \varepsilon_{i} \\ &x_{1}, \dots, x_{n} \sim N(2, 1^{2}) \\ &\varepsilon_{i} \sim N(0, 1^{2}) \end{aligned} \]
using \(n = 200\).
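A possible sketch of a solution (the seed is arbitrary): simulate the data once, then resample the \((x_{i}, y_{i})\) pairs with replacement and refit the linear model on each bootstrap sample.
# Sketch of a solution: pairs bootstrap for the regression coefficients
set.seed(1)
n <- 200
x <- rnorm(n, 2, 1)
y <- -1 + 2 * x + rnorm(n, 0, 1)
B <- 5000
beta_boot <- array(NA, dim = c(B, 2))
for (b in 1:B){
  idx_boot <- sample(1:n, n, replace = TRUE)
  beta_boot[b, ] <- coef(lm(y[idx_boot] ~ x[idx_boot]))
}
apply(beta_boot, 2, sd) # bootstrap standard errors for (intercept, slope)
summary(lm(y ~ x))$coefficients[, 2] # compare with the usual model-based standard errors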